beers = read.csv('data/Beers.csv')
breweries = read.csv('data/Breweries.csv')
str(beers)
## 'data.frame': 2410 obs. of 7 variables:
## $ Name : Factor w/ 2305 levels "#001 Golden Amber Lager",..: 1638 577 1705 1842 1819 268 1160 758 1093 486 ...
## $ Beer_ID : int 1436 2265 2264 2263 2262 2261 2260 2259 2258 2131 ...
## $ ABV : num 0.05 0.066 0.071 0.09 0.075 0.077 0.045 0.065 0.055 0.086 ...
## $ IBU : int NA NA NA NA NA NA NA NA NA NA ...
## $ Brewery_id: int 409 178 178 178 178 178 178 178 178 178 ...
## $ Style : Factor w/ 100 levels "","Abbey Single Ale",..: 19 18 16 12 16 80 18 22 18 12 ...
## $ Ounces : num 12 12 12 12 12 12 12 12 12 12 ...
str(breweries)
## 'data.frame': 558 obs. of 4 variables:
## $ Brew_ID: int 1 2 3 4 5 6 7 8 9 10 ...
## $ Name : Factor w/ 551 levels "10 Barrel Brewing Company",..: 355 12 266 319 201 136 227 477 59 491 ...
## $ City : Factor w/ 384 levels "Abingdon","Abita Springs",..: 228 200 122 299 300 62 91 48 152 136 ...
## $ State : Factor w/ 51 levels " AK"," AL"," AR",..: 24 18 20 5 5 41 6 23 23 23 ...
How many breweries are present in each state?
# Option 1
# breweriesState = breweries %>%
# group_by(State) %>%
# summarise(Breweries = n()) %>%
# ungroup()
breweriesState = breweries %>% count(State) # Option 2
DT::datatable(breweriesState,rownames = F)
str(breweriesState)
## Classes 'tbl_df', 'tbl' and 'data.frame': 51 obs. of 2 variables:
## $ State: Factor w/ 51 levels " AK"," AL"," AR",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ n : int 7 3 2 11 39 47 8 1 2 15 ...
# https://rstudio.github.io/leaflet/map_widget.html
mapStates = map("state", fill = TRUE, plot = FALSE)
str(mapStates)
## List of 4
## $ x : num [1:15599] -87.5 -87.5 -87.5 -87.5 -87.6 ...
## $ y : num [1:15599] 30.4 30.4 30.4 30.3 30.3 ...
## $ range: num [1:4] -124.7 -67 25.1 49.4
## $ names: chr [1:63] "alabama" "arizona" "arkansas" "california" ...
## - attr(*, "class")= chr "map"
mapStates$names
## [1] "alabama" "arizona"
## [3] "arkansas" "california"
## [5] "colorado" "connecticut"
## [7] "delaware" "district of columbia"
## [9] "florida" "georgia"
## [11] "idaho" "illinois"
## [13] "indiana" "iowa"
## [15] "kansas" "kentucky"
## [17] "louisiana" "maine"
## [19] "maryland" "massachusetts:martha's vineyard"
## [21] "massachusetts:main" "massachusetts:nantucket"
## [23] "michigan:north" "michigan:south"
## [25] "minnesota" "mississippi"
## [27] "missouri" "montana"
## [29] "nebraska" "nevada"
## [31] "new hampshire" "new jersey"
## [33] "new mexico" "new york:manhattan"
## [35] "new york:main" "new york:staten island"
## [37] "new york:long island" "north carolina:knotts"
## [39] "north carolina:main" "north carolina:spit"
## [41] "north dakota" "ohio"
## [43] "oklahoma" "oregon"
## [45] "pennsylvania" "rhode island"
## [47] "south carolina" "south dakota"
## [49] "tennessee" "texas"
## [51] "utah" "vermont"
## [53] "virginia:chesapeake" "virginia:chincoteague"
## [55] "virginia:main" "washington:san juan island"
## [57] "washington:lopez island" "washington:orcas island"
## [59] "washington:whidbey island" "washington:main"
## [61] "west virginia" "wisconsin"
## [63] "wyoming"
# This will not work becasue of the way states are defined; would need major rework
# example state names are "Arizona", but we have "AZ" in out dataframe.
# Also, some states are named "michigan:north", "michigan:south", etc.
# Anyway adding some examples here in case we want to explore this further
leaflet(data = mapStates) %>% addTiles() %>%
addPolygons(fillColor = topo.colors(10, alpha = NULL), stroke = FALSE)
m = leaflet() %>% addTiles()
df = data.frame(
lat = rnorm(100),
lng = rnorm(100),
size = runif(100, 5, 20),
color = sample(colors(), 100)
)
m = leaflet(df) %>% addTiles()
m %>% addCircleMarkers(radius = ~size, color = ~color, fill = FALSE)
## Assuming "lng" and "lat" are longitude and latitude, respectively
m %>% addCircleMarkers(radius = runif(100, 4, 10), color = c('red'))
## Assuming "lng" and "lat" are longitude and latitude, respectively
states <- geojsonio::geojson_read("http://eric.clst.org/assets/wiki/uploads/Stuff/gz_2010_us_outline_20m.json", what = "sp")
class(states)
## [1] "SpatialLinesDataFrame"
## attr(,"package")
## [1] "sp"
names(states)
## [1] "TYPE" "R_STATEFP" "L_STATEFP"
#head(states)
# Not sure yet if I need to get a special token for this.
# Below code is only plotting the boundary of US, not the states
# We can also try with other JSON files. They are available in the same link path as above.
m <- leaflet(states) %>%
setView(-96, 37.8, 4) %>%
addProviderTiles("MapBox", options = providerTileOptions(
id = "mapbox.light",
accessToken = Sys.getenv('MAPBOX_ACCESS_TOKEN')))
m %>% addPolygons()
# https://plot.ly/r/choropleth-maps/
df <- read.csv("https://raw.githubusercontent.com/plotly/datasets/master/2011_us_ag_exports.csv")
df$hover <- with(df, paste(state, '<br>', "Beef", beef, "Dairy", dairy, "<br>",
"Fruits", total.fruits, "Veggies", total.veggies,
"<br>", "Wheat", wheat, "Corn", corn))
# give state boundaries a white border
l <- list(color = toRGB("white"), width = 2)
# specify some map projection/options
g <- list(
scope = 'usa',
projection = list(type = 'albers usa'),
showlakes = TRUE,
lakecolor = toRGB('white')
)
p <- plot_geo(df, locationmode = 'USA-states') %>%
add_trace(
z = ~total.exports, text = ~hover, locations = ~code,
color = ~total.exports, colors = 'Purples'
) %>%
colorbar(title = "Millions USD") %>%
layout(
title = '2011 US Agriculture Exports by State<br>(Hover for breakdown)',
geo = g
)
## Warning: package 'bindrcpp' was built under R version 3.4.4
p
Merge beer data with the breweries data. Print the first 6 observations and the last six observations to check the merged file
data = merge(beers,breweries,by.x='Brewery_id',by.y='Brew_ID',all=T)
h4('First 6 Observations')
DT::datatable(head(data,6),rownames = F)
h4('Last 6 Observations')
DT::datatable(tail(data,6),rownames = F)
Report the number of NA’s in each column
countNA = sapply(data,function(x){sum(is.na(x))})
kable(countNA,col.names='Count of NA')
| Count of NA | |
|---|---|
| Brewery_id | 0 |
| Name.x | 0 |
| Beer_ID | 0 |
| ABV | 62 |
| IBU | 1005 |
| Style | 0 |
| Ounces | 0 |
| Name.y | 0 |
| City | 0 |
| State | 0 |
Compute the median alcohol content and international bitterness unit for each state. Plot a bar chart to compare
summary = data %>%
group_by(State) %>%
summarise(ABVMedian = median(ABV,na.rm=T)
,IBUMedian = median(IBU,na.rm=T)
) %>%
ungroup()
ggplot(data=filter(summary,!is.na(ABVMedian))
,aes(x=fct_reorder(State,ABVMedian,desc=T)
,y=ABVMedian)) +
geom_col() +
xlab("State") +
ylab("Median Alcohol Content") +
scale_y_continuous(labels=percent)+
coord_flip()
ggplot(data=filter(summary,!is.na(IBUMedian))
,aes(x=fct_reorder(State,IBUMedian,desc=T)
,y=IBUMedian)) +
geom_col() +
xlab("State") +
ylab("Median International Bitterness Unit") +
coord_flip()
Which state has the maximum alcoholic (ABV) beer? Which state has the most bitter (IBU) beer?
message("The State with the maximum alcoholic (ABV) beer is:"
,arrange(data,desc(ABV))$State[1]
)
## The State with the maximum alcoholic (ABV) beer is: CO
message("The State with the most bitter (IBU) beer is:"
,arrange(data,desc(IBU))$State[1]
)
## The State with the most bitter (IBU) beer is: OR
Summary statistics for the ABV variable:
summary(data$ABV)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.00100 0.05000 0.05600 0.05977 0.06700 0.12800 62
Is there an apparent relationship between the bitterness of the beer and its alcoholic content? Draw a scatter plot.
ggplot(data=data,aes(x=IBU,y=ABV))+
geom_point() +
geom_smooth(method='lm') +
scale_y_continuous(labels=percent)
model=lm(data=data,formula=ABV~IBU)
model
##
## Call:
## lm(formula = ABV ~ IBU, data = data)
##
## Coefficients:
## (Intercept) IBU
## 0.0449303 0.0003508
We have a very slight positive correlation between ABV and IBU, with a coefficient of 0.0351 of increase % Alcohol per IBU point